library(xtable)
library(gtools)

##survey

#load the cleaned Study 1 survey data
d <- read.csv("surveydata_clean_study1.csv")

#wave whose remaining participants are checked for representativeness
wave <- 7

#education levels as spelled in the data, in the order wanted for the table,
#and their English labels (same order, matched by position)
ed_levels <- c(
  "Folkeskole / Grundskole",
  "Gymnasial uddannelse (Student, HF, HH, HTX og lign.)",
  "Erhvervsuddannelse",
  "Kort videregående uddannelse",
  "Mellemlang videregående uddannelse (Bachelor niveau)",
  "Lang videregående uddannelse (Kandidat niveau)"
)
ed_levels_eng <- c("Elementary", "High school", "Professional", "Short tertiary", "Bachelor","Master")

#order and translate in one step; values not listed in ed_levels become NA
d$education <- factor(d$education, levels = ed_levels, labels = ed_levels_eng)

#create own age categories: 18-29, ten-year bands from 30, and 70 and up
age_midbreaks <- seq(30, 70, 10)
#cut points: lower bound of every band plus an upper cap of 125
age_lowlims <- c(18, age_midbreaks, 125)
#band labels "Age 18-29", "Age 30-39", ..., "Age 70-up"
age_cats <- paste("Age", paste(c(18, age_midbreaks), c(age_midbreaks - 1, "up"), sep = "-"))
#left-closed bands [lower, upper); include.lowest closes the last band at 125.
#ages outside 18-125 become NA. labels are passed directly, as for the
#population data further down, and TRUE/FALSE replace the reassignable T/F
d$age_cat10 <- cut(d$age, age_lowlims, labels = age_cats,
                   include.lowest = TRUE, right = FALSE)

#keep only respondents who participated in wave (non-missing weight for it)
weight_col <- paste0("Weight.", wave)
if (!(weight_col %in% names(d))) {
  stop("Column '", weight_col, "' not found in the survey data", call. = FALSE)
}
d_nonmiss <- d[!is.na(d[[weight_col]]), ]

#for each background variable, compute the share of respondents per category
#(table() leaves out NA values), then stack the per-variable tables once
#instead of growing a data frame inside a loop
backgrcols <- c("gender", "age_cat10", "region", "education")
proptab_list <- lapply(backgrcols, function(bgcol) {
  proptab_one <- data.frame(prop.table(table(d_nonmiss[[bgcol]])))
  names(proptab_one) <- c("category", "proportion")
  proptab_one
})
proptab_svy <- do.call(rbind, proptab_list)


##Danmark Statistik

#gender
gender_age <- read.csv("DK Statistik/gender_age.csv")
#NOTE(review): these totals sum over every row of gender_age, while the age
#table below leaves out ages outside 18-125 (cut() returns NA for them and
#tapply() ignores NA groups) - confirm the file holds adults only, otherwise
#the gender shares include minors
gender_tab <- colSums(gender_age[c("Mand", "Kvinde")])
gender_prop <- gender_tab / sum(gender_tab)

#age: same bands and labels as the survey's age_cat10
gender_age$age_cat <- cut(gender_age$alder, age_lowlims, labels = age_cats,
                          include.lowest = TRUE, right = FALSE)
age_tab <- tapply(gender_age$Total, gender_age$age_cat, sum)
age_prop <- age_tab / sum(age_tab)

#region
region <- read.csv("DK Statistik/region.csv")
region <- region[c("region", "total")]
#share of the overall total per region, named by region
region_prop <- setNames(region$total / sum(region$total), region$region)

#education
education <- read.csv("DK Statistik/education.csv")[c("uddannelse","total")]
rownames(education) <- education$uddannelse

#levels that are pooled to line up with the survey's categories,
#and the "unknown" level that is left out
eds_H50_H60 <- c("H50 Mellemlange videregående uddannelser, MVU",
                 "H60 Bacheloruddannelser, BACH")
eds_H70_H80 <- c("H70 Lange videregående uddannelser, LVU",
                 "H80 Ph.d. og forskeruddannelser")
ed_unknown <- "H90 Uoplyst mv."

#indexing a data frame by a row name that does not exist returns NA without
#any error, so check the labels before summing
missing_eds <- setdiff(c(eds_H50_H60, eds_H70_H80), rownames(education))
if (length(missing_eds) > 0) {
  stop("Education levels not found in DK Statistik/education.csv: ",
       paste(missing_eds, collapse = "; "), call. = FALSE)
}

#sum H50 and H60, and H70 and H80
total_H50_H60 <- sum(education[eds_H50_H60, "total"])
total_H70_H80 <- sum(education[eds_H70_H80, "total"])
edu_joint <- data.frame(uddannelse=c("H50+H60 Bachelor", "H70+H80 Lang MA/PhD"),
                        total=c(total_H50_H60, total_H70_H80))
education <- rbind(education, edu_joint)

#drop levels that have been summed, and H90 unknown
dropeds <- c(eds_H50_H60, eds_H70_H80, ed_unknown)
education <- education[!(row.names(education) %in% dropeds),]
#sort by label so the rows line up with ed_levels_eng by position
education <- education[order(education$uddannelse),]

#the English labels are attached by position: a different number of rows
#would mislabel categories (or leave NA names) without any error
if (nrow(education) != length(ed_levels_eng)) {
  stop("Expected ", length(ed_levels_eng), " education levels but found ",
       nrow(education), ": ", paste(education$uddannelse, collapse = "; "),
       call. = FALSE)
}

#get proportions
edu_prop <- education$total/sum(education$total)
names(edu_prop) <- ed_levels_eng

#stack the four named proportion vectors in table order (gender, age, region,
#education); the names of the stacked vector are the category labels
dk_props <- c(gender_prop, age_prop, region_prop, edu_prop)
proptab_DK <- data.frame(category = names(dk_props),
                         proportion = dk_props)


##merge survey and DK Statistik

#the merge is an inner join on category, so categories present in only one
#table disappear from the result - report them instead of dropping silently
svy_cats <- as.character(proptab_svy$category)
dk_cats <- as.character(proptab_DK$category)
unmatched <- c(setdiff(svy_cats, dk_cats), setdiff(dk_cats, svy_cats))
if (length(unmatched) > 0) {
  warning("Categories without a counterpart are dropped from the table: ",
          paste(unmatched, collapse = "; "), call. = FALSE)
}

proptab <- merge(proptab_svy, proptab_DK, by="category", sort = FALSE)
#merge() documents the row order as unspecified when sort = FALSE; restore the
#order of the survey table explicitly, since the rule positions used when
#printing the LaTeX table depend on it
proptab <- proptab[order(match(proptab$category, proptab_svy$category)), ]
rownames(proptab) <- NULL

#proportion.x is the survey share, proportion.y the population share
proptab$percent_survey <- round(proptab$proportion.x*100, 1)
proptab$percent_population <- round(proptab$proportion.y*100, 1)
#difference of the rounded percentages, in percentage points
proptab$difference <- proptab$percent_survey - proptab$percent_population
proptab <- proptab[c("category","percent_survey","percent_population","difference")]

#further level translations: look up the remaining Danish labels in a
#Danish -> English table; all other categories are left unchanged
category_eng <- c("Kvinde" = "Female",
                  "Mand" = "Male",
                  "Hovedstaden" = "Capital region")
proptab$category <- as.character(proptab$category)
needs_translation <- proptab$category %in% names(category_eng)
proptab$category[needs_translation] <-
  unname(category_eng[proptab$category[needs_translation]])

#write to files: a CSV with the raw column names, then a LaTeX table with
#display names; both share the same path stem
out_stem <- paste0("tables/demographic_proportions_wave", wave)
write.csv(proptab, paste0(out_stem, ".csv"))
colnames(proptab)[2:3] <- c("pct. in survey","pct. in population")
proptabx <- xtable(proptab, digits=c(0,0,1,1,1),
               caption = 'Comparing proportions of Study 1 respondents in demographic categories to Danish population statistics (source: Danmark Statistik, Q1 2022).',
               label= "tab:represent")
#horizontal rules above and below the header, then after rows 2, 8, 13 and 19
#NOTE(review): the positions are hard-coded - presumably they mark the end of
#the gender, age, region and education groups; confirm if categories change
print(proptabx, file=paste0(out_stem, ".tex"),
      hline.after = c(-1,0,2,8,13,19), include.rownames=FALSE)